#Libraries needed
library(tidyverse)
library(ggExtra)
library(tidygraph)
library(ggraph)
library(cowplot)
library(kableExtra)
library(gridExtra)
library(scales)
library(GenomicRanges)

#Chromosome choice for different steps
test_CHR = 6            # single chromosome used for the threshold exploration below
subset_CHR = "ALL"      # whole-genome subset used for the per-individual steps

#Software paths 
soft_dir = "/home/alice/Software/"
list_dir = "/home/alice/WW_PopGen/Keep_lists_samples/"
gatk_path = paste0(soft_dir, "gatk-4.1.4.1/gatk")
#java_path = "/Library/Java/JavaAppletPlugin.plugin/Contents/Home/bin/java"
bcftools_path = paste0(soft_dir, "bcftools-1.10.2/")

#Directory paths
dir_name = "/data2/alice/WW_project/"
data_dir = "/data2/alice/WW_project/0_Data/"
VAR_dir = paste0(dir_name, "1_Variant_calling/")
  depth_per_window_dir = paste0(VAR_dir, "1_Depth_per_window/")
  vcf_dir = paste0(VAR_dir, "4_Joint_calling/")
  vcf_qual_check_dir = paste0(VAR_dir, "5_Quality_checks/")
    
# File paths
TE_annot = paste0(dir_name, "4_TE_RIP/Ursula_IPO323_TE_merged.bed")
vcf_core_name = "Ztritici_global_March2021.genotyped."
test_vcf_name = paste0(vcf_dir, vcf_core_name, test_CHR)
subset_vcf_name = paste0(vcf_dir, vcf_core_name, subset_CHR, ".filtered.clean.AB_filtered.variants")
ref_genome = paste0(dir_name, "0_Data/", "Zymoseptoria_tritici.MG2.dna.toplevel.mt+.fa")


#This is a file that contains the name of all samples I am not interested in
# for example Michael Habig's mutants and other resequencing of IPO323.
# (FALSE is used rather than F: T/F are plain variables that can be reassigned,
# while TRUE/FALSE are reserved words.)
unwanted_samples=read_tsv(paste0(data_dir, "Samples_to_filter_out.args"), col_names = FALSE) %>% pull()

# Export the paths so that the bash chunks below can reference them as $VARS
Sys.setenv(DIRNAME=dir_name)
Sys.setenv(TESTVCFNAME=test_vcf_name)
Sys.setenv(SUBVCFNAME=subset_vcf_name)
Sys.setenv(REF=ref_genome)
Sys.setenv(GATKPATH=gatk_path)
#Sys.setenv(JAVAPATH=java_path)
Sys.setenv(BCFTOOLSPATH=bcftools_path)


knitr::opts_chunk$set(message = FALSE)
knitr::opts_chunk$set(warning = FALSE)
knitr::opts_chunk$set(results = TRUE)

Pre-step 1 - Filtering based on site: identifying the proper thresholds

Below, I first extract the SNPs and, as a second step, I remove all genotypes with a depth that I consider too low for the data to be reliable. This threshold can and should be adapted depending on the dataset. Finally, I use bcftools to extract the values of the different estimates we can filter on.


#Removing indels
# SelectVariants with --select-type-to-include SNP keeps SNP records only.
${GATKPATH} SelectVariants  \
  -R ${REF} \
  -V ${TESTVCFNAME}.vcf.gz \
 --select-type-to-include SNP \
 -O ${TESTVCFNAME}.snps.vcf

#Filtering on low depth per genotype
#### REMOVED SNPS in input!!
# NOTE(review): the input here is the raw ${TESTVCFNAME}.vcf.gz, not the
# SNP-only file produced above, even though the output is named *.snps.* .
# The "REMOVED SNPS in input" note above suggests this is deliberate — confirm.
# Genotypes with DP < 3 are flagged "Low_depth" and set to no-call.
${GATKPATH} VariantFiltration \
 -R ${REF} \
 -V ${TESTVCFNAME}.vcf.gz \
--genotype-filter-expression "DP < 3"  \
--set-filtered-genotype-to-no-call   \
--genotype-filter-name "Low_depth"   \
-O ${TESTVCFNAME}.snps.low_depth.vcf

Here, I clean the file to remove alternate alleles that no longer exist because they were only found in the filtered-out genotypes. Additionally, I remove the positions which are non-variant now that some genotypes are filtered out.

# Re-select variants after the genotype-level filtering:
# --remove-unused-alternates drops ALT alleles carried only by now-missing
# genotypes, and --exclude-non-variants drops sites left invariant.
${GATKPATH} SelectVariants  \
  -R ${REF} \
  -V ${TESTVCFNAME}.snps.low_depth.vcf \
 --exclude-non-variants --remove-unused-alternates \
 -O ${TESTVCFNAME}.snps.low_depth.variants.vcf

# The intermediate file is no longer needed.
rm ${TESTVCFNAME}.snps.low_depth.vcf 
#Extracting the data for the figures
# One row per site with the INFO fields used to choose hard-filter thresholds.
${BCFTOOLSPATH}bcftools query -f '%CHROM %POS %MQ %QD %FS %DP %MQRankSum %QUAL %ReadPosRankSum %BaseQRankSum\n'  ${TESTVCFNAME}.snps.low_depth.variants.vcf > ${TESTVCFNAME}.snps.low_depth.variants.info_fields.tab

Now that we have the data extracted, I can make figures and test values for the filtering.

# Candidate site-level hard-filter thresholds for the GATK INFO fields.
FS_thr = 10
MQ_thr = 20
QD_thr = 20
ReadPosRankSum_thr_high = 2
ReadPosRankSum_thr_low = -2
MQRankSum_thr_high = 2
MQRankSum_thr_low = -2
BaseQRankSum_thr_high = 2
BaseQRankSum_thr_low = -2
  
print(paste0(test_vcf_name, ".snps.low_depth.variants.info_fields.tab"))
## [1] "/data2/alice/WW_project/1_Variant_calling/4_Joint_calling/Ztritici_global_March2021.genotyped.6.snps.low_depth.variants.info_fields.tab"
# Read the bcftools query output; "." marks missing INFO values.
info = read_delim(paste0(test_vcf_name, ".snps.low_depth.variants.info_fields.tab"),
                  delim = " ", col_names =  c("CHROM", "POS", "MQ", "QD", "FS", "DP",
                                              "MQRankSum", "QUAL", "ReadPosRankSum", "BaseQRankSum"), 
                  na = ".")
# Long format: one row per (site, INFO field), before filtering.
temp = info %>% 
  gather(key = "info_field", value = "value", -CHROM, -POS) %>% 
  mutate (Filter = "Unfiltered")

# Same, keeping only sites that pass all candidate thresholds. The RankSum
# filters keep NA values because these fields are undefined at many sites.
temp2 = info  %>% filter(FS < FS_thr & MQ > MQ_thr & QD > QD_thr) %>% 
  filter(between(MQRankSum, MQRankSum_thr_low, MQRankSum_thr_high) | is.na(MQRankSum)) %>% 
  filter(between(BaseQRankSum, BaseQRankSum_thr_low, BaseQRankSum_thr_high) | is.na(BaseQRankSum)) %>% 
  filter(between(ReadPosRankSum, ReadPosRankSum_thr_low, ReadPosRankSum_thr_high) | is.na(ReadPosRankSum)) %>% 
  filter(QUAL > 500) %>%
  gather(key = "info_field", value = "value", -CHROM, -POS) %>% 
  mutate (Filter = "Filtered")

# Density of each INFO field before filtering...
p = ggplot(temp, aes(value)) + geom_density(fill = "#CBF3F0") 
p + facet_wrap(info_field~., scales="free")  + theme_classic() 

# ...and after applying the candidate thresholds.
p2 = ggplot(temp2, aes(value)) + geom_density(fill = "#2EC4B6")
p2 + facet_wrap(info_field~., scales="free")  + theme_classic()

# Number of distinct sites before vs after filtering.
dim(temp %>% select(CHROM, POS) %>% group_by_all %>% count)
## [1] 549121      3
dim(temp2 %>% select(CHROM, POS) %>% group_by_all %>% count) 
## [1] 426127      3

The values obtained above can be used in a script to run directly on the cluster for all chromosomes.

Pre-step 2 - Filter checks using resequencing and added layer of position filtering

Types of variants, types of errors and effect of step 1 filtering on errors

I want to assess the effect of the filtering on the SNP data. For this, I use four columns from the filtered and unfiltered vcf: chromosome, position, reference allele and alternative allele. This allows me to count the variants, find where they are, and what type of variants they are, in terms of both number of alleles and whether we are looking at SNPs or indels.

# Read the CHROM/POS/REF/ALT tables of all 21 chromosomes, before and after
# the quality filtering, then classify each record:
#  - Nb_allele: number of alleles at the site (commas in ALT + 2 = REF + ALTs)
#  - Nb_letter: total number of bases written across REF and ALT
#  - Indel: a "*" (spanning deletion) in REF/ALT, or more letters than
#    alleles, marks the record as an indel; otherwise it is a SNP.
var_list = bind_rows(paste0(vcf_qual_check_dir, vcf_core_name, c(1:21), ".tab") %>%
                         map_df(~read_tsv(., col_names = c("CHROM", "POS", "REF", "ALT"), 
                           col_types = cols(col_character(), col_double(), col_character(), col_character()))) %>%
                         mutate(Filter = "1_Raw_variants"),
                     paste0(vcf_qual_check_dir, vcf_core_name, c(1:21), ".filtered.clean.tab") %>%
                         map_df(~read_tsv(., col_names = c("CHROM", "POS", "REF", "ALT"), 
                           col_types = cols(col_character(), col_double(), col_character(), col_character()))) %>%
                         mutate(Filter = "2_Filtered_on_quality") ) %>% 
  mutate(Nb_allele = str_count(ALT, pattern = ",") + 2) %>%
  mutate(Nb_letter = str_count(ALT, pattern = "[A-Z]") + str_count(REF, pattern = "[A-Z]")) %>%
  mutate(Indel = ifelse((str_count(ALT, pattern = "[*]") + str_count(REF, pattern = "[*]")) > 0, "Indel", 
                        ifelse(Nb_letter != Nb_allele, "Indel", "SNP"))) %>%
  unite(col = "Var_type", Indel, Nb_allele, sep = "_", remove = F)

# Summary table: variant counts per type, raw vs quality-filtered,
# transposed so the two filter levels become the table columns.
kable(var_list  %>%
  group_by(Filter, Var_type) %>%
  count() %>% 
  pivot_wider(names_from = Var_type, values_from = n) %>%
    mutate(Total = sum(Indel_2, Indel_3, SNP_2, SNP_3)) %>%
    t())
Filter 1_Raw_variants 2_Filtered_on_quality
Indel_2 571489 503076
Indel_3 2367140 2051124
SNP_2 6540378 5789157
SNP_3 650787 579132
Total 10129794 8922489
# Stacked bars of variant counts per chromosome and variant type,
# faceted by filtering step (raw vs quality-filtered).
variants_per_step1 = var_list  %>%
group_by(CHROM, Filter, Var_type) %>%
count()  %>%
ggplot(aes(x=as.numeric(CHROM), y =n, fill = Var_type)) +
  geom_bar(stat = "identity", position = "stack") +
  facet_wrap(vars(Filter)) +
  coord_flip() +
  labs(title = "Variants number before and after filtering on position quality", 
       x = "Chromosome", y = "Number of variants", 
       fill = "Variant type") +
  theme_minimal() +
  scale_fill_manual(values = c("#FFD399", "#ff9f1c", "#cbf3f0", "#2ec4b6"))
variants_per_step1

We have 8 isolates that were resequenced several times in different datasets. Using these, we can estimate the number of false positives that occur and the types of variants that are most susceptible to such errors.

# Pairs of sample names corresponding to the same resequenced isolate.
reseq_list = read_delim(paste0(list_dir, "Repeat_sequencing.txt"), delim =" ", col_names = c("Name_1", "Name_2"))


# Read, for every resequencing pair, the per-chromosome tables of genotype
# differences between the two clones, before and after quality filtering.
# seq_along() replaces 1:length() (safe on empty input).
list_diff = list()
for (sample in seq_along(reseq_list$Name_1)) {

  # Filtered tables carry the "filtered.clean." prefix...
  list_filtered = list.files(path = vcf_qual_check_dir,
                             pattern = paste0("filtered.clean.", reseq_list$Name_1[sample], ".tab"),
                             full.names = TRUE)
  # ...and the unfiltered ones are every other file matching the sample name
  # (the filtered set is computed once above and reused, instead of calling
  # list.files() a second time with the same arguments).
  list_unfiltered = setdiff(list.files(path = vcf_qual_check_dir,
                                       pattern = paste0(reseq_list$Name_1[sample], ".tab"),
                                       full.names = TRUE),
                            list_filtered)

  # Drop the whole-genome concatenated files, keep the per-chromosome ones.
  list_unfiltered = list_unfiltered[!str_detect(list_unfiltered, pattern=".ALL.")]

  filtered = list_filtered %>%
      map_df(~read_tsv(., col_names = c("CHROM", "POS", "REF", "ALT", "Clone1", "Clone2"),
                          col_types = cols(col_character(), col_double(), col_character(), col_character(), col_character(), col_character()))) %>%
      mutate(Filter = "2_Filtered_on_quality")

  unfiltered = list_unfiltered %>%
      map_df(~read_tsv(., col_names = c("CHROM", "POS", "REF", "ALT", "Clone1", "Clone2"),
                          col_types = cols(col_character(), col_double(), col_character(), col_character(), col_character(), col_character()))) %>%
      mutate(Filter = "1_Raw_variants")

  list_diff[[sample]] = bind_rows(filtered, unfiltered) %>% mutate(Sample = reseq_list$Name_1[sample])
}

# Gather every table previously read in one single dataframe
# and classify each discordant position (same SNP/indel and allele-count
# logic as for var_list above).
err_var_list = bind_rows(list_diff) %>% 
  mutate(Nb_allele = str_count(ALT, pattern = ",") + 2) %>%
  mutate(Nb_letter = str_count(ALT, pattern = "[A-Z]") + str_count(REF, pattern = "[A-Z]")) %>%
  mutate(Indel = ifelse((str_count(ALT, pattern = "[*]") + str_count(REF, pattern = "[*]")) > 0, "Indel", 
                        ifelse(Nb_letter != Nb_allele, "Indel", "SNP"))) %>%
  unite(col = "Var_type", Indel, Nb_allele, sep = "_", remove = FALSE) 

# Free the per-sample list; only the combined dataframe is needed below.
rm(list_diff)

I not only want to know how many loci contain genotyping errors but also what type of variants these are. So I will make some more plots to figure this out.

# Bar plot of variant type per resequencing pair
p1 = err_var_list  %>%
group_by(Filter, Sample, Var_type) %>%
count()  %>%
ggplot(aes(x=Sample, y =n, fill = Var_type)) +
  geom_bar(stat = "identity", position = "stack") +
  facet_wrap(vars(Filter)) +
  coord_flip()+
  theme_minimal() +
  scale_fill_manual(values = c("#FFD399", "#ff9f1c", "#cbf3f0", "#2ec4b6"))

# Bar plot of variant type per chromosome (positions deduplicated across pairs)
p2 = err_var_list  %>%
  select(CHROM, POS, Filter, Var_type) %>%
  distinct() %>%
group_by(CHROM, Filter, Var_type) %>%
count()  %>%
ggplot(aes(x=as.numeric(CHROM), y =n, fill = Var_type)) +
  geom_bar(stat = "identity", position = "stack") +
  facet_grid(cols = vars(Filter)) +
  theme_minimal() +
  scale_fill_manual(values = c("#FFD399", "#ff9f1c", "#cbf3f0", "#2ec4b6"))

# Shared legend for both panels. The plots map Var_type to the *fill*
# aesthetic, so the guide and the legend title must target `fill`
# (the previous `guides(color = ...)` and `labs(Fill = ...)` referred to
# aesthetics that do not exist and were silently ignored by ggplot2).
legend_b <- get_legend(
  p1 + 
    guides(fill = guide_legend(nrow = 1)) +
    theme(legend.position = "bottom") +
    labs(fill = "Variant type")
)

prow = plot_grid( p1 + theme(legend.position="none", axis.title = element_blank(), plot.margin = margin(r = 10)),
               p2 + theme(legend.position="none", axis.title = element_blank(), plot.margin = margin(r = 10)),
               nrow = 1, rel_widths = c(1, 0.8), labels = c("A", "B"))
plot_grid(prow, legend_b, rel_heights = c(1, 0.1), ncol = 1)

Let’s visualize both datasets together so we can compare the proportions of all variants vs errors.

#Bar plots of the comparison
# Proportion of each variant type among all variants vs erroneous variants,
# before and after quality filtering (position = "fill" normalizes each bar).
p1 = bind_rows(err_var_list %>%
            unite(col = "Var_type", Indel, Nb_allele, sep = "_")%>%
            select(CHROM, POS, Filter, Var_type) %>%
            distinct() %>% 
            mutate(Dataset = "Erroneous variants"),
          var_list %>% 
            mutate(Dataset = "All variants")) %>%
  group_by(Filter, Var_type, Dataset) %>%
  count()  %>%
  ggplot(aes(x=Filter, y =n, fill = Var_type)) +
     geom_bar(stat = "identity", position = "fill") + 
     facet_grid(rows = vars(Dataset)) +
     labs(x = "", y = "Proportion of variants") +
     theme_minimal()+
     scale_fill_manual(values = c("#FFD399", "#ff9f1c", "#cbf3f0", "#2ec4b6"))



#Prep summary table
# t1/t3: total counts, t2/t4: SNP-only counts, for all vs erroneous variants.
# Each table has one row per filter level: row 1 = 1_Raw_variants,
# row 2 = 2_Filtered_on_quality (alphabetical group order from group_by).
t1 = var_list %>% 
  group_by(Filter) %>%
  count()  
t2 = var_list %>% 
  filter(Indel == "SNP") %>%
  group_by(Filter) %>%
  count()   
t3 = err_var_list %>% 
  group_by(Filter) %>%
  count()  
t4 = err_var_list %>% 
  filter(Indel == "SNP") %>%
  group_by(Filter) %>%
  count() 

# "Filtered out" = raw minus kept (row 1 minus row 2 of each table above);
# label_y places the percentage labels in the middle of each bar segment.
temp = tibble(Filter_status = c(rep(c("Filtered out", "Kept after filtering"), 4)),
       Variant_type = c(rep(c("All", "All", "SNP", "SNP"), 2)),
       Dataset = c(rep("All variants", 4), rep("Erroneous variants", 4)),
       Nb_variants = c(pull(t1[1, 2] - t1[2, 2]), pull(t1[2, 2]),
                       pull(t2[1, 2] - t2[2, 2]), pull(t2[2, 2]), 
                       pull(t3[1, 2] - t3[2, 2]), pull(t3[2, 2]),
                       pull(t4[1, 2] - t4[2, 2]), pull(t4[2, 2]))) %>%
  group_by(Dataset, Variant_type) %>%
  mutate(Sum = sum(Nb_variants)) %>%
  arrange(., desc(Filter_status), .by_group = T) %>%
  mutate(label_y = cumsum(Nb_variants) - 0.5 * Nb_variants)


#Bar plots of filtered variants/SNPs amongst all and erroneous variants
p2 = ggplot(temp, aes(fill = Filter_status, x = Variant_type, y = Nb_variants)) +
  geom_bar(stat = "identity") +
  theme_bw() + 
  facet_grid(row = vars(Dataset), scales = "free_y")+ 
  scale_fill_manual(values = c("#CBF3F0", "#2EC4B6")) + 
  geom_text(aes(y = label_y, label = percent(round(Nb_variants / Sum, 3)))) +
  labs(x = "Variant type", y = "Number of variants")



# Plot everything together
plot_grid(p1 + theme(legend.position = "bottom", plot.margin = margin(l = 5, r = 10)), 
          p2 + theme(legend.position = "bottom", plot.margin = margin(l = 10)), 
          labels = c("A", "B"), nrow = 1)

Once we have a general idea of the error quantity, I want to look at the variants distribution along the genomes.

# For each erroneous locus, count how many resequencing pairs share it, then
# tabulate loci by that sharing count. table() makes Var1 a factor whose
# levels are the observed sharing counts. Pre-filtering version:
t2 = as.data.frame(table(err_var_list %>% 
  filter(Filter == "1_Raw_variants") %>%
  group_by(CHROM, POS, Filter, Var_type) %>% 
  count() %>%
    ungroup() %>%
  select(n)))

# Same, post quality filtering.
t3 = as.data.frame(table(err_var_list %>% 
  filter(Filter == "2_Filtered_on_quality") %>%
  group_by(CHROM, POS, Filter, Var_type) %>% 
  count() %>%
    ungroup() %>%
  select(n)))

# NOTE(review): Var1 is a factor from table(); as.numeric(Var1) returns the
# level *index*, not the count value. The two coincide only when the observed
# sharing counts are consecutive integers starting at 1 — confirm for this
# dataset.
nb_shared = sum(t2[as.numeric(t2$Var1) > 1,]$Freq)
nb_all = sum(t2$Freq)
# Pre-filtering: number of loci by how many pairs share them; shared loci
# (count > 1) are highlighted as candidates for removal.
p1 = ggplot(t2, aes(x = Var1, y = Freq, label = Freq)) +
  geom_bar(aes(fill = as.numeric(Var1) > 1), stat = "identity") +
  geom_label(nudge_y = 6000) +
  labs(x = "Number of pairs sharing an erroneous variant locus",
       y = "Number of loci",
       title = "Some erroneous variant loci are shared between samples",
       subtitle = paste0("Pre-filtering number of erroneous variant loci: ", nb_all, "; shared loci: ", nb_shared), 
       fill = str_wrap("To remove in subsequent filtering", 20)) +
  theme_minimal() +
  scale_fill_manual(values = c("#ff9f1c", "#FFD399")) 



# Post-filtering equivalent of the plot above.
nb_shared = sum(t3[as.numeric(t3$Var1) > 1,]$Freq)
nb_all = sum(t3$Freq)
p2 = ggplot(t3, aes(x = Var1, y = Freq, label = Freq)) +
  geom_bar(aes(fill = as.numeric(Var1) > 1), stat = "identity") +
  geom_label(nudge_y = 3000) +
  labs(x = "Number of pairs sharing an erroneous variant locus",
       y = "Number of loci",
       subtitle = paste0("Post-filtering number of erroneous variant loci: ", nb_all, "; shared loci: ", nb_shared), 
       fill = str_wrap("To remove in subsequent filtering", 20)) +
  theme_minimal() +
  scale_fill_manual(values = c("#ff9f1c", "#FFD399")) 

# Shared legend for the sharing plots. The plots map the shared/not-shared
# flag to the *fill* aesthetic, so the guide and the legend title must target
# `fill` (the previous `guides(color = ...)` and `labs(Fill = ...)` referred
# to aesthetics that do not exist and were silently ignored by ggplot2).
legend_b <- get_legend(
  p1 + 
    guides(fill = guide_legend(nrow = 1)) +
    theme(legend.position = "bottom") +
    labs(fill = "Variant type")
)

# Stack the two panels. The previous rel_widths = c(1, 1.1, 0.1) specified
# three column widths for a single-column, two-plot grid and had no effect.
# NOTE(review): legend_b is built but not included in this grid — add it as a
# third element (with a small rel_heights entry) if the shared legend should
# be displayed, as done for the earlier figure.
plot_grid(p1 + theme(legend.position = "none"), 
          p2 + theme(legend.position = "none"), 
          ncol = 1)

# Barplot with layering of all and filtered only variants
# Pre-filtering counts (t2) are drawn semi-transparent behind the
# post-filtering counts (t3); both levels use the same orange, so only the
# colour depth distinguishes the layers.
ggplot(t3, aes(x = Var1, y = Freq, label = Freq, fill = as.numeric(Var1) > 1)) +
  geom_bar(data = t2, aes(x = Var1, y = Freq, label = Freq, fill = as.numeric(Var1) > 1), stat = "identity", alpha = 0.7) +
  geom_bar(stat = "identity", alpha = 1) +
  geom_label(nudge_y = 6000, fill = "white") +
  labs(x = "Number of pairs sharing an erroneous variant locus",
       y = "Number of loci",
       title = "Some erroneous variant loci are shared between samples",
       subtitle = paste0("Post-filtering number of erroneous variant loci: ", nb_all), 
       fill = str_wrap("Filtered out", 20)) +
  theme_minimal() +
  scale_fill_manual(values = c("#FF9F1C", "#FF9F1C")) 

# Printing of the erroneous variant table shared by at least 2 samples 
# Writes the post-filtering loci shared by >= 2 resequencing pairs to a
# headerless TSV, to be excluded in the next filtering round.
err_var_list %>% 
  filter(Filter == "2_Filtered_on_quality") %>%
  group_by(CHROM, POS, Filter, Var_type) %>% 
  count() %>% ungroup %>% 
  filter(n > 1) %>%
  select(CHROM, POS) %>%
  write_tsv(paste0(vcf_qual_check_dir, "Erroneous_positions_from_resequencing.tsv"), col_names = FALSE)

We already know that some erroneous variants are actually found in multiple resequencing pairs. But even more generally, are errors found in the same area of the genomes for all resequencing pairs?

#Let's first create windows along the whole genome 
# Chromosome lengths come from the reference .fai index.
chromosomes = read_tsv(paste0(ref_genome, ".fai"),
                        col_names = c("CHROM", "Length", "Byte_index", "Base_per_line", "Bytes_per_line")) 
win_size = 10000

# One window start every win_size bp per chromosome, duplicated for the two
# filter levels so that the later full_join() fills every window in both
# facets. BUG FIX: the windows must run to each chromosome's own length
# (Length[chr]); the previous Length[14] used chromosome 14's length for
# every chromosome.
list_temp = list()
for (chr in seq_along(chromosomes$CHROM)) {
  list_temp[[chr]] = bind_rows(tibble(CHROM = as.character(chr), 
                            Window = seq(from = 0, to = chromosomes$Length[chr], by = win_size), 
                            Filter = "1_Raw_variants"),
                            tibble(CHROM = as.character(chr), 
                            Window = seq(from = 0, to = chromosomes$Length[chr], by = win_size), 
                            Filter = "2_Filtered_on_quality"))

}

windows = (bind_rows(list_temp))


# Transforming the tables with the differences to match the windows
#bind_rows(list_diff) %>% 
# Assign each erroneous variant to its nearest window, count per
# sample/window, then full_join with the complete window grid so that
# windows with no error get an explicit 0.
temp = err_var_list %>% 
  mutate(Window = win_size*round(POS / win_size)) %>%
  group_by(CHROM, Filter, Sample, Window) %>%
  count() %>%
  pivot_wider(names_from = Sample, values_from = n) %>%
  full_join(., windows) %>%
  pivot_longer(-c(CHROM, Filter, Window), names_to = "Sample", values_to = "Nb_SNP") %>%
  mutate(Nb_SNP = replace_na(Nb_SNP, 0))

# And finally, plotting the results
# Raw errors along the first six chromosomes, one facet row per chromosome.
temp %>%
  filter(Filter == "1_Raw_variants") %>%
  filter(as.integer(CHROM) <= 6) %>% 
ggplot(., aes(x = Window, y = Nb_SNP, col = Sample)) +
  geom_line(alpha = .4) +
  geom_point(alpha = .4) + 
  theme_minimal() + 
  facet_grid(rows = vars(as.integer(CHROM)), cols = vars(Filter))

# Same view with both filter levels side by side.
temp %>%
  filter(as.integer(CHROM) <= 6) %>% 
ggplot(., aes(x = Window, y = Nb_SNP, col = Sample)) +
  geom_line(alpha = .4) +
  theme_minimal() + 
  facet_grid(rows = vars(as.integer(CHROM)), cols = vars(Filter)) +
  theme(legend.position = "none")

# TE annotation as CHROM/Start/Stop intervals (BED-style).
TE_annot_t = read_tsv(TE_annot, col_names = c("CHROM", "Start", "Stop")) #%>% 
#                separate(temp, into = c("whatever", "CHROM"), sep = "_") %>%
#                select(-whatever) 

# Chromosome 4, post-filtering errors per window, with the TE intervals
# drawn as light rectangles behind, to eyeball the error/TE overlap.
temp %>%
  filter(Filter == "2_Filtered_on_quality") %>%
  filter(as.integer(CHROM) == 4) %>% 
ggplot(data = .)  + 
  geom_rect(data=TE_annot_t %>% filter(as.integer(CHROM) == 4), 
            aes(xmin=Start, xmax=Stop, ymin=0, ymax=200), 
            color = "#FFEDD6", fill = "#FFEDD6", alpha=0.5)+
  geom_line(aes(x = Window, y = Nb_SNP, col = Sample), alpha = .4) +
  theme_minimal() + 
  facet_grid(rows = vars(as.integer(CHROM)), cols = vars(Filter), scales = "free_x") +
  theme(legend.position = "none") 

From the graph, it does look like there is a relationship between the error-rich regions and the TE annotations; I quantify this overlap below.

# GRanges objects for overlap arithmetic: errors before filtering, the TE
# intervals, and errors after quality filtering. Each error is a 1-bp range,
# so sum(width()) counts error positions.
unfilterrorsGR = makeGRangesFromDataFrame(err_var_list %>% 
                                      filter(Filter == "1_Raw_variants") %>%
                                      select(chr = CHROM, start = POS, end = POS) %>% 
                                      unique())  
TEGR = makeGRangesFromDataFrame(TE_annot_t %>% 
                                  #mutate(Start = Start - 100, Stop = Stop + 100) %>%
                                  select(chr = CHROM, start = Start, end = Stop))  
errorsGR = makeGRangesFromDataFrame(err_var_list %>% 
                                      filter(Filter == "2_Filtered_on_quality") %>%
                                      select(chr = CHROM, start = POS, end = POS) %>% 
                                      unique())  

cat("Total number of unfiltered errors:", sum(width(unfilterrorsGR)), "\n")

Total number of unfiltered errors: 96451

cat("Total number of errors post-filtering:", sum(width(errorsGR)), "\n")

Total number of errors post-filtering: 51040

cat("Total length of TEs:", sum(width(TEGR)), "\n")

Total length of TEs: 7515661

# intersect()/setdiff() here are the GenomicRanges methods operating on
# genomic intervals, not the base-R set functions.
cat("Error number of errors post-filtering in TEs:", sum(width(intersect(errorsGR, TEGR))) , "\n")

Error number of errors post-filtering in TEs: 23751

cat("Error number of errors post-filtering outside of TEs:", sum(width(setdiff(errorsGR, TEGR))), "\n")

Error number of errors post-filtering outside of TEs: 27289

# Concatenate the per-chromosome vcftools allele-count files into a single
# genome-wide table, dropping the repeated header lines.
cat /data2/alice/WW_project/1_Variant_calling/5_Quality_checks/Ztritici_global_March2021.genotyped.*.filtered.clean.frq.count | grep -v "CHROM" > /data2/alice/WW_project/1_Variant_calling/5_Quality_checks/Ztritici_global_March2021.genotyped.concat.filtered.clean.frq.count.concat
# Read the concatenated vcftools --counts output. Count1..Count3 hold the
# per-allele counts (up to three alleles per site); rowwise() makes
# min()/max() operate per site. Minor allele frequency = minorAC / N_CHR.
counts = read_tsv("/data2/alice/WW_project/1_Variant_calling/5_Quality_checks/Ztritici_global_March2021.genotyped.concat.filtered.clean.frq.count.concat", 
                     col_names = c("CHROM", "POS", "N_ALLELES", "N_CHR", "Count1", "Count2", "Count3"), 
                     col_types = cols(col_character(), col_double(), col_double(), col_double(), 
                                      col_double(), col_double(), col_double())) %>%
  rowwise() %>%
  mutate(minorAC = min(Count1, Count2, Count3, na.rm = TRUE), 
         majorAC = max(Count1, Count2, Count3, na.rm = TRUE),
         freq = minorAC/N_CHR)


# Attach the population allele counts to the erroneous positions
# (both filter levels are kept here; see the commented-out filter).
temp = err_var_list %>% 
 # filter(Filter == "2_Filtered_on_quality") %>%
  select(CHROM, POS) %>% 
  unique() %>% 
  left_join(., counts) %>%
  mutate(Variant_type = "Erroneous")

# Minor allele frequency spectrum: erroneous variants vs all variants.
bind_rows(temp, counts %>% mutate(Variant_type = "All_variants"))  %>%
ggplot(aes(x = freq, fill = Variant_type, col = Variant_type)) +
  geom_density(alpha =.4) +
  scale_fill_manual(values = c("#FF9F1C", "#2EC4B6")) +
  scale_color_manual(values = c("#FF9F1C", "#2EC4B6"))+
  theme_minimal()

# Frequency spectrum of the erroneous variants alone.
ggplot(temp, aes(x = freq)) +
  geom_density(fill ="#2EC4B6", col = "#2EC4B6") +
  theme_minimal()

Set up allelic balance filter

We clearly have way too many errors after the first round of filtering. Another possible cause of errors could be that the intrinsic machinery of GATK is made for diploids and not for haploids. How does it treat the positions with reads giving conflicting information that would be called heterozygous positions in a diploid?

# Allelic-balance threshold: fraction of reads that must support the
# majority allele for a genotype to be considered balanced.
AB_threshold = 0.9

#Read data for all chromosomes and for each pair of samples
# seq_along() replaces 1:length() (safe on empty input).
list_AD = list()
for (sample in seq_along(reseq_list$Name_1)) {

  list_AD[[sample]] = list.files(path = vcf_qual_check_dir, pattern = paste0("filtered.clean.", reseq_list$Name_1[sample], ".AD.tab"), 
                                 full.names = TRUE) %>% 
      map_df(~read_tsv(., col_names = c("CHROM", "POS", "REF", "ALT", "GT1", "AD1", "GT2", "AD2"), 
                          col_types = cols(col_character(), col_double(), col_character(), col_character(), 
                                           col_character(), col_character(), col_character(), col_character()))) %>%
      mutate(Filter = "2_Filtered_on_quality")  %>% 
      mutate(Sample = reseq_list$Name_1[sample]) 
}

# Gather every table previously read in one single dataframe.
# The AD fields ("ref,alt1[,alt2]") are split into per-allele depths;
# AB1/AB2 is the fraction of reads supporting the majority allele in each
# clone, and min_AD the smaller of the two majority-allele depths.
# rowwise() makes min()/max()/sum() operate per genotype.
err_AD_list = bind_rows(list_AD) %>%
  separate(AD1, into = c("AD1_0", "AD1_1", "AD1_2"), sep = ",", remove = FALSE, convert = TRUE) %>%
  separate(AD2, into = c("AD2_0", "AD2_1", "AD2_2"), sep = ",", remove = FALSE, convert = TRUE) %>%
  rowwise() %>%
  mutate(AB1 = max(AD1_0, AD1_1, AD1_2, na.rm = TRUE)/sum(AD1_0, AD1_1, AD1_2, na.rm = TRUE),
         AB2 = max(AD2_0, AD2_1, AD2_2, na.rm = TRUE)/sum(AD2_0, AD2_1, AD2_2, na.rm = TRUE),
         min_AD = min(max(AD1_0, AD1_1, AD1_2, na.rm = TRUE), max(AD2_0, AD2_1, AD2_2, na.rm = TRUE))) %>%
  # elementwise & rather than scalar &&: identical under rowwise(), and still
  # correct if the rowwise grouping is ever dropped
  mutate(Allelic_balance = ifelse(AB1 > AB_threshold & AB2 > AB_threshold, "Expected AB", "Unbalanced AB")) %>% 
  mutate(Nb_allele = str_count(ALT, pattern = ",") + 2) %>%
  mutate(Nb_letter = str_count(ALT, pattern = "[A-Z]") + str_count(REF, pattern = "[A-Z]")) %>%
  mutate(Indel = ifelse((str_count(ALT, pattern = "[*]") + str_count(REF, pattern = "[*]")) > 0, "Indel", 
                        ifelse(Nb_letter != Nb_allele, "Indel", "SNP"))) %>%
  unite(col = "Var_type", Indel, Nb_allele, sep = "_", remove = FALSE) 

#Illustrating threshold relevance
# Density of the per-genotype allelic balance (both clones pooled), with the
# chosen AB threshold overlaid as a vertical line.
density_AB = err_AD_list %>%
  select(AB1, AB2) %>%
  pivot_longer(cols = c(AB1, AB2), names_to = "Whatever", values_to = "Allelic_balance") %>%
  ggplot(aes(Allelic_balance)) + 
    geom_density(fill = "grey80") + 
    theme_cowplot() +
    geom_vline(xintercept = AB_threshold, col = "goldenrod") +
    labs(y = element_blank(), x = "AB and AB threshold") + 
  theme(axis.title.x = element_text(size = 10),
        axis.text = element_text(size = 10))


#Counting the effect of the filter on the errors
err_AD_list %>%
  group_by(Var_type, Allelic_balance) %>%
  dplyr::count() %>%
  pivot_wider(names_from = Allelic_balance, values_from = n)
# Bar plot: erroneous variants by AB class and variant type.
bars_AD = err_AD_list %>%
  group_by(Filter, Var_type, Allelic_balance) %>%
  count() %>%
  ggplot(aes(x=Allelic_balance, y =n, fill = Var_type)) +
     geom_bar(stat = "identity") +
     labs(x = "Allelic balance", y = "Proportion of variants") +
     theme_cowplot()+
     scale_fill_manual(values = c("#FFD399", "#ff9f1c", "#cbf3f0", "#2ec4b6", "grey"))

# Inset the AB density inside the bar plot (top-right corner).
ggdraw(bars_AD + theme(legend.position = "left")) + 
  draw_plot(density_AB, x = 0.73, y = 0.62, width = 0.26, height = 0.38)

#Numbers
# Errors still present after the AB filter: positions where both clones show
# the expected allelic balance yet the genotypes remain discordant.
errorsAB = makeGRangesFromDataFrame(err_AD_list %>% 
                                      filter(Allelic_balance == "Expected AB")  %>%
                                      select(chr = CHROM, start = POS, end = POS) %>% 
                                      unique())  

# Error counts at each filtering stage and their overlap with TEs
# (intersect/setdiff are the GenomicRanges interval methods).
cat("Total number of unfiltered errors:", sum(width(unfilterrorsGR)), "\n",
    "Total number of errors post-quality filtering:", sum(width(errorsGR)), "\n",
    "Total number of errors post-AB filtering :", sum(width(errorsAB)), "\n",
    "----\n",
    "Total length of TEs:", sum(width(TEGR)), "\n",
    "Error number of errors post-quality filtering in TEs:", sum(width(intersect(errorsGR, TEGR))) , "\n",
    "Error number of errors post-quality filtering outside of TEs:", sum(width(setdiff(errorsGR, TEGR))), "\n",
    "Error number of errors post-AB filtering in TEs:", sum(width(intersect(errorsAB, TEGR))) , "\n",
    "Error number post-AB filtering outside of TEs:", sum(width(setdiff(errorsAB, TEGR))), "\n")
## Total number of unfiltered errors: 96451 
##  Total number of errors post-quality filtering: 51040 
##  Total number of errors post-AB filtering : 26578 
##  ----
##  Total length of TEs: 7515661 
##  Error number of errors post-quality filtering in TEs: 23751 
##  Error number of errors post-quality filtering outside of TEs: 27289 
##  Error number of errors post-AB filtering in TEs: 7419 
##  Error number post-AB filtering outside of TEs: 19159
#Figure
# Erroneous variant counts per variant type at each filtering level; the
# AB level keeps only positions still balanced in both clones.
All_P = bind_rows(err_var_list, 
          err_AD_list %>% 
            filter(Allelic_balance == "Expected AB") %>%
            mutate(Filter = "3_Filtered_on_AB")) %>%
  select(Var_type, Filter, POS, CHROM) %>%
  distinct() %>%
  ggplot(aes(x=Filter, fill = Var_type)) +
     geom_bar(position = "dodge") + 
     labs(x = element_blank(), 
          y = "Number of erroneous variants",
       fill = "Variant type") +
     theme_minimal() +
     scale_fill_manual(values = c("#FFD399", "#ff9f1c", "#cbf3f0", "#2ec4b6", "grey"))


# Same figure restricted to biallelic SNPs, with the counts printed on the
# bars. after_stat(count) replaces the deprecated ..count.. notation
# (ggplot2 >= 3.3).
SNP_P = bind_rows(err_var_list, 
          err_AD_list %>% 
            filter(Allelic_balance == "Expected AB") %>%
            mutate(Filter = "3_Filtered_on_AB")) %>%
  filter(Var_type == "SNP_2") %>%
  select(Var_type, Filter, POS, CHROM) %>%
  distinct() %>%
   ggplot(aes(x=Filter, fill = Var_type)) +
     geom_bar(position = "dodge") + 
     stat_count(geom = "text", colour = "black", size = 3.5,
                aes(label = after_stat(count)), position = position_stack(vjust = 0.5)) + 
     labs(x = element_blank(), y = "Number of erroneous variants") +
     theme_minimal() +
     scale_fill_manual(values = c("#cbf3f0", "#2ec4b6", "grey")) +
  theme(legend.position = "none")

# A: all variant types; B: biallelic SNPs only.
cowplot::plot_grid(All_P + coord_flip() + theme(legend.position = "left"), 
                   SNP_P + coord_flip(), 
                   rel_widths = c(1, 0.8), labels = c("A", "B"))

# Distribution of the minimum allelic depth among the errors passing the AB
# filter — used to assess an additional depth-based genotype filter.
p1 = err_AD_list %>%
  filter(Allelic_balance == "Expected AB") %>%
  group_by(Var_type, Filter, POS, CHROM) %>%
  summarize (MIN_AD = min(min_AD)) %>%
  filter(MIN_AD < 20) %>%
  ggplot(aes(x=MIN_AD, fill = Var_type)) +
  geom_histogram(bins = 19) +
     theme_minimal() +
     scale_fill_manual(values = c("#FFD399", "#ff9f1c", "#cbf3f0", "#2ec4b6", "grey")) +
  labs(x = "Minimum allelic depth", 
       y = "Variant count",
       fill = "Variant type")

# Same for biallelic SNPs only, binning everything >= 10 into "+10".
# geom_bar() counts the discrete MIN_AD values directly — the previous
# geom_histogram(bins = 20, stat = "count") triggered a warning and ignored
# `bins`; after_stat(count) replaces the deprecated ..count.. notation.
p2 = err_AD_list %>%
  filter(Allelic_balance == "Expected AB") %>%
  group_by(Var_type, Filter, POS, CHROM) %>%
  summarize (MIN_AD = min(min_AD)) %>%
  ungroup() %>%
  mutate(MIN_AD = ifelse(MIN_AD >= 10, "+10", MIN_AD)) %>%
  filter(Var_type == "SNP_2") %>%
  ggplot(aes(x=MIN_AD, fill = Var_type)) +
  geom_bar() +
     theme_minimal() +
     scale_fill_manual(values = c("#cbf3f0")) +
  labs(x = "Minimum allelic depth", 
       y = "Variant count",
       fill = "Variant type")+ 
  stat_count(geom = "text", colour = "black", size = 3.5,
             aes(label = after_stat(count)), position = position_stack(vjust = 0.5))

cowplot::plot_grid(p1 + theme(legend.position = "top"), 
                   p2 + theme(legend.position = "none"), 
                   labels = c("A", "B"), ncol = 1)

Pre-step 3 - Filtering based on individuals

First, let’s look at missing data and depth. Samples with a large amount of missing data are increasing the amount of positions with missing data (which will be filtered out in some instances) for nothing. So I would rather remove some samples completely rather than a higher number of positions for all samples.

# Per-individual missingness (.imiss) and mean depth (.idepth) from vcftools,
# excluding the samples flagged as unwanted at the top of this document.
info_per_ind = full_join(read_tsv(paste0(subset_vcf_name, ".imiss")), 
                         read_tsv(paste0(subset_vcf_name, ".idepth"))) %>%
  filter(!(INDV %in% unwanted_samples))

# Per-individual thresholds: at most 20% missing genotypes, mean depth > 6x.
NA_thr = 0.2
depth_thr = 6
p = info_per_ind %>% 
  mutate(to_remove = F_MISS >= NA_thr | MEAN_DEPTH <= depth_thr) %>%
ggplot(aes(x = F_MISS, y = MEAN_DEPTH, col = to_remove)) + 
  geom_point(alpha = 0.6) +
  geom_vline(aes(xintercept = NA_thr), col = "#ffa62b", linetype = "dashed") +
  geom_hline(aes(yintercept = depth_thr), col = "#ffa62b", linetype = "dashed") + 
  theme_classic() +
  scale_color_manual(values = c("black", "grey")) +
  labs(title = "Missing data and depth in vcf",
       subtitle = str_wrap(paste0("There are ", sum(info_per_ind$F_MISS >= NA_thr | info_per_ind$MEAN_DEPTH <= depth_thr), 
                         " samples with too much missing data and ", 
                         sum(info_per_ind$F_MISS < NA_thr & info_per_ind$MEAN_DEPTH > depth_thr), " that are fine."), 
                         width = 60),
       col = "Filtered sample", 
       y = "Mean depth across core chromosome variants",
       x = "Missing fraction of core chromosome variants") +
  theme(legend.position = "bottom")
# Marginal histograms of both axes around the scatter plot.
p1 <- ggMarginal(p, type="histogram", size = 2)
p1

# Record the failing samples, both in memory (used by the IBS section below)
# and as a headerless .args file for GATK.
samples_to_filter = info_per_ind %>% 
  filter(F_MISS >= NA_thr | MEAN_DEPTH <= depth_thr) %>%
  select(INDV) %>% 
  pull()
info_per_ind %>% 
  filter(F_MISS >= NA_thr | MEAN_DEPTH <= depth_thr) %>%
  select(INDV) %>%
  write_tsv(paste0(data_dir, "Sample_with_too_much_NA.args"), col_names = FALSE)

The samples with too much missing data or low depth (in grey, on the right of the vertical gold or below the horizontal gold line) will be removed. These samples happen to come in large part from one particular collection: among the 135 samples, 92 (68%) are from the Swiss collection of 2016.

# Pairwise identity-by-state matrix: sample IDs from the .mibs.id file, the
# square matrix from .mibs, reshaped to long format (one row per ordered pair).
ibs = read_tsv(paste0(subset_vcf_name, ".mibs.id"), col_names = c("Whatever", "ID")) %>% select(ID) %>% pull()
related = read_tsv(paste0(subset_vcf_name, ".mibs"), col_names = ibs) %>%
  mutate(ID1 = ibs) %>%
  pivot_longer(names_to = "ID2", values_to = "Relatedness", cols = -ID1) 

# Density of the off-diagonal IBS values (self-pairs excluded).
p1 = related %>%
  filter(ID1 != ID2) %>%
  ggplot(aes(x = Relatedness)) +
  geom_density() + 
  geom_vline(aes(xintercept = 0.9)) +
  theme_minimal()



# Floor everything below 0.9 to zoom on the high-IBS tail in the next plot.
related = related %>% mutate(Bounded_relatedness = ifelse(Relatedness < 0.9, 0.9, Relatedness))

# Density of the bounded values; 0.99 (gold line) is the clone cut-off used below.
p2 = related %>%
  filter(ID1 != ID2) %>%
  ggplot(aes(x = Bounded_relatedness)) +
  geom_density() + 
  geom_vline(aes(xintercept = 0.9), col = "grey") + 
  geom_vline(aes(xintercept = 0.99), col = "#ffa62b") +
  theme_minimal()

plot_grid(p1, p2)

Now, I want to represent the relatedness measure: once with all samples (heatmap A), and once including only the samples which have an IBS of at least 0.99 with at least one other sample (heatmap B). I am only plotting the bounded relatedness as defined by the yellow line in the graph above.

# All samples
# Heatmap of bounded IBS across every pair of samples.
p1 = ggplot(related, aes(x = ID1, y = ID2, fill = Bounded_relatedness)) +
  geom_tile() +
  scale_fill_gradient(high = "white", low = "#16697a") +
  #labs(title = "Pairwise IBS across all samples") +
  theme(axis.text = element_blank())


# Only highly related samples
# Pairs with IBS >= 0.99, excluding self-pairs and the samples already
# flagged for removal; vector_temp lists every sample involved in such a
# pair (with repeats), table_temp counts how often each appears.
temp = related %>% 
  filter(Relatedness >= 0.99) %>%
  filter(ID1 != ID2) %>%
  filter(!(ID1 %in% samples_to_filter)) %>%
  filter(!(ID2 %in% samples_to_filter))
vector_temp = c(temp %>% select(ID1) %>% pull(),
        temp %>% select(ID2) %>% pull())
table_temp = as.data.frame(table(vector_temp))
length(unique(vector_temp))
## [1] 210
# Heatmap restricted to those highly related samples.
p2 = related %>% 
  filter(!(ID1 %in% samples_to_filter)) %>%
  filter(!(ID2 %in% samples_to_filter)) %>%
  filter(ID1 %in% vector_temp) %>%
  filter(ID2 %in% vector_temp) %>%
  ggplot(aes(x = ID1, y = ID2, fill = Relatedness)) +
  geom_tile() +
  scale_fill_gradient(high = "white", low = "#16697a")+
  theme(axis.text = element_blank())


plot_grid(p1 + theme(legend.position = "none"), p2, 
          rel_widths = c(1, 1.3), labels = c("A", "B"))

From the heatmaps, we can already tell that there are some samples which are clones or quasi-clones from one another. Some seem to go only two by two (mistaken resequencing of the same clone or isolation of two isolates from the same lesion, for example). But there also seem to be larger groups of samples, particularly visible in the heatmap B.

It is not trivial to get groups of clones by looking at the heatmaps or a simple table. Instead, I choose here to use networks and find “communities” with a simplistic cut-off approach.

#For the graph representation, keep only samples with more than one
# connection: each pair contributes both of its members to vector_temp,
# so a sample with a single connection has Freq == 2
nodes <- table_temp %>%
  filter(Freq > 2) %>%
  rename(label = vector_temp) %>%
  select(label) %>%
  rowid_to_column("id")

#Translate sample names into node ids and keep only pairs where both
# members survived the node filtering above
edges <- temp %>%
  filter(ID1 %in% nodes$label,
         ID2 %in% nodes$label) %>%
  select(-Bounded_relatedness) %>%
  left_join(nodes, by = c("ID1" = "label")) %>%
  rename(from = id) %>%
  left_join(nodes, by = c("ID2" = "label")) %>%
  rename(to = id) %>%
  select(from, to, weight = Relatedness)

#I can then transform the data in a tbl_graph
#I also want to color the nodes based on the component
# (undirected graph: nodes = samples, edges = high-relatedness pairs)

routes_tidy <- tbl_graph(nodes = nodes, edges = edges, directed = F)

# Colour each node by its connected component; group_components() is
# evaluated by ggraph in the context of the graph's node data
ggraph(routes_tidy) + 
  geom_edge_link(color = "gray20", alpha = 0.3) + 
  geom_node_point(aes(color = as.character(group_components())), show.legend = F, size = 3) +
  theme_graph() #+ facet_nodes(~ group_components())

The colors in the figure above represent the “components” (or communities/subgraphs) automatically detected after applying a threshold value. We can see that the components are indeed identified as expected. This is the information I need, since I would like to filter the data keeping only one sample per group. Now, I want to do this for all samples, not just the ones with more than one edge.

# Same node table as before, but now keeping every sample
# (including the ones with a single connection)
nodes <- table_temp %>%
  rename(label = vector_temp) %>%
  select(label) %>%
  rowid_to_column("id")

#Translate sample names into node ids; with the unfiltered node table
# every pair in temp is retained
edges <- temp %>%
  filter(ID1 %in% nodes$label,
         ID2 %in% nodes$label) %>%
  select(-Bounded_relatedness) %>%
  left_join(nodes, by = c("ID1" = "label")) %>%
  rename(from = id) %>%
  left_join(nodes, by = c("ID2" = "label")) %>%
  rename(to = id) %>%
  select(from, to, weight = Relatedness)

#I can then transform the data in a tbl_graph and annotate each node
# with its connected component ("group"): samples in the same component
# form one clone group.
# (The previous version built the graph twice; the first, unannotated
# tbl_graph was dead code, immediately overwritten, and has been removed.)
routes_tidy <- tbl_graph(nodes = nodes, edges = edges, directed = F) %>%
  mutate(group = group_components())

#I extract the group/component information from the graph's node table
# and merge this with the depth and NA data per sample
# (info_per_ind is built in an earlier chunk; assumed keyed by INDV —
# TODO confirm against that chunk)
nodes_with_groups <- routes_tidy %>%
  activate(nodes) %>%
  data.frame() %>%
  left_join(., info_per_ind, by = c("label" = "INDV"))

#Identifying the sample with the lowest amount of missing data per group.
# ties.method = "first" makes the ranking deterministic across reruns:
# the previous ties.method = "random" could keep a different sample each
# time the script was executed, silently changing the downstream sample
# lists written to disk.
nodes_with_groups = nodes_with_groups %>%
  group_by(group) %>%
  mutate(rank  = rank(F_MISS, ties.method = "first")) %>%
  mutate(Filter_based_on_ibs = ifelse(rank == 1, "Keep", "Filter_out"))

#Let's make a pretty doughnut plot just for the fun of it.
# A bar at x = 2 with xlim(0.5, 2.5) + polar coordinates produces the
# doughnut hole.
# NOTE(review): this object shadows base::round; function calls such as
# round(n * 0.8) later in the script still resolve to base::round (R
# skips non-function objects in call position), but renaming this plot
# would be safer. Not renamed here because it is referenced below.
round = as.data.frame(table(nodes_with_groups$Filter_based_on_ibs)) %>%
  ggplot(aes(x=2, y=Freq, fill=Var1)) +
  geom_bar(stat="identity") +
  xlim(0.5, 2.5) +
  coord_polar(theta = "y") +
  scale_fill_manual(values = c("#16697a", "#ffa62b")) +
  labs(x=NULL, y=NULL, fill="",
       title = "Filtering based on IBS threshold and depth/NA",
       subtitle = "For each group, the sample with the lowest amount of missing data is kept.") +
  theme_bw() +
  # Strip all axis/panel decoration so only the ring and labels remain
  theme(legend.text=element_text(size=10),
        axis.ticks=element_blank(),
        axis.text=element_blank(),
        axis.title=element_blank(),
        panel.grid=element_blank(),
        panel.border=element_blank()) + 
  geom_text(aes(label = Freq), 
                position = position_stack(vjust = 0.5), size = 4)
round

# Combine the scatter plot p (built in an earlier chunk — presumably the
# per-sample depth vs missing-data plot; TODO confirm) with marginal
# histograms, next to the keep/filter doughnut.
cowplot::plot_grid(ggMarginal(p, type="histogram", size = 2), 
                   round + theme(plot.title = element_text(size = 10), plot.subtitle = element_text(size = 8)), 
                   rel_widths = c(1, 0.6))

#Samples flagged for removal: every clone-group member except the kept one
ibs_filtered = nodes_with_groups %>%
  ungroup() %>%
  filter(Filter_based_on_ibs == "Filter_out") %>%
  select(label)

# Good samples: little missing data, enough coverage, and not flagged
# as a redundant clone
good_samples = info_per_ind %>%
  filter(F_MISS < NA_thr, MEAN_DEPTH > depth_thr) %>%
  filter(!(INDV %in% ibs_filtered$label))
# Save the per-sample clone-group table (with depth/NA info) for reference
write_tsv(nodes_with_groups, paste0(data_dir, "Groups_based_on_IBS.tab"), col_names = FALSE)

#Writing a file to indicate which samples to remove
write_tsv(ibs_filtered, paste0(data_dir, "Sample_removed_based_on_IBS.args"), col_names = FALSE)

# Write the list of good samples (one sample name per line)
write_tsv(select(good_samples, INDV),
          paste0(list_dir, "Ztritici_global_March2021.genotyped.good_samples.args"),
          col_names = FALSE)
# Reading in the summarized depth of coverage over the core chromosomes,
# restricted to the good samples (join key made explicit: Sample)
core_depth = read_tsv(paste0(depth_per_window_dir, "Depth_per_sample_core_chr_summary.q30.txt")) %>%
  mutate(Median_core = Median) %>%
  inner_join(good_samples %>% select(Sample = INDV), by = "Sample")

# Per-chromosome depth, normalized by each sample's core-chromosome median
chrom_depth = read_tsv(paste0(depth_per_window_dir, "Depth_per_chromosome_summary.q30.txt")) %>%
  left_join(core_depth %>% select(Sample, Median_core), by = "Sample") %>%
  mutate(Relative_depth = as.numeric(Median) / as.numeric(Median_core)) %>%
  inner_join(good_samples %>% select(Sample = INDV), by = "Sample")

# Density of relative depth per chromosome; the orange line marks the
# 0.2 cut-off shown for reference.
# NOTE(review): as.numeric("mt") yields NA, so the mitochondrial genome
# lands in an NA facet — confirm this is intended.
ggplot(data = chrom_depth, aes(Relative_depth)) +
  geom_density(col = "#16697a", fill = "#16697a", alpha =0.4) +
  facet_wrap(vars(as.numeric(CHROM)), scales = "free") +
  theme_classic() +
  geom_vline(xintercept = 0.2, col = "#ffa62b")

# Setting thresholds for presence/absence calls on relative depth
Lthres = 0.50
Hthres = 1.50
# For each (non-mitochondrial) chromosome, count the samples in which it
# is present, i.e. relative depth is not below Lthres.
# Nested ifelse kept so NA relative depths stay NA and are dropped.
depth = chrom_depth %>%
  filter(CHROM != "mt") %>%
  dplyr::mutate(Depth_is = ifelse(Relative_depth > Hthres, "High",
                                  ifelse(Relative_depth < Lthres, "Low", "Normal"))) %>%
  filter(Depth_is != "Low") %>%
  dplyr::count(CHROM)

max_samples = max(depth$n)
# Add the mitochondrion back as present in all samples, then derive how
# many absent samples the 80% missing-data filter tolerates per chromosome
# (base::round still resolves here even though `round` was reused as a
# plot-object name earlier: R skips non-functions in call position)
temp = depth %>%
  add_row(CHROM = "mt", n = max_samples) %>%
  mutate(Threshold_80 = max_samples - round(n * 0.8)) %>%
  select(CHROM, Nb_present = n, Threshold_80)
temp
write_tsv(temp, paste0(data_dir, "Missing_data_thresholds.tab"), col_names = FALSE)

Number of variants per step

#Reading the variants stats for the next steps. Each .tab file holds one
# variant per line (CHROM POS REF ALT); a Filter label records the step.
next_steps = bind_rows(read_tsv(paste0(vcf_qual_check_dir, vcf_core_name, "ALL.filtered.clean.AB_filtered.variants.tab"), 
                   col_names = c("CHROM", "POS", "REF", "ALT")) %>%
            mutate(Filter = "3_Filtered_errors"), 
                   read_tsv(paste0(vcf_qual_check_dir, vcf_core_name, "ALL.filtered.clean.AB_filtered.variants.good_samples.tab"), 
                            col_names = c("CHROM", "POS", "REF", "ALT")) %>%
            mutate(Filter = "4_Filtered_samples")) %>% 
  bind_rows(., read_tsv(paste0(vcf_qual_check_dir, vcf_core_name, "ALL.filtered.clean.AB_filtered.variants.good_samples.max-m-80.tab"), 
                            col_names = c("CHROM", "POS", "REF", "ALT")) %>%
            mutate(Filter = "5_Filtered_80_NA")) %>%
  # Number of alleles = REF (1) + ALT alleles (comma count + 1)
  mutate(Nb_allele = str_count(ALT, pattern = ",") + 2) %>%
  # Total number of bases written across REF and all ALT alleles
  mutate(Nb_letter = str_count(ALT, pattern = "[A-Z]") + str_count(REF, pattern = "[A-Z]")) %>%
  # Any "*" allele (VCF spanning deletion) or any multi-base allele
  # (more letters than alleles) marks the site as an indel; otherwise SNP
  mutate(Indel = ifelse((str_count(ALT, pattern = "[*]") + str_count(REF, pattern = "[*]")) > 0, "Indel", 
                        ifelse(Nb_letter != Nb_allele, "Indel", "SNP"))) %>%
  # e.g. "SNP_2" = biallelic SNP, "Indel_3" = triallelic indel
  unite(col = "Var_type", Indel, Nb_allele, sep = "_", remove = F)
#Tables
# Count variants per filtering step and variant type (one row per step);
# the group_by(Filter) persists through pivot_wider, so the Total is
# computed within each row's group rather than over the whole column.
count_by_type <- function(df) {
  df %>%
    group_by(Filter, Var_type) %>%
    count() %>%
    pivot_wider(names_from = Var_type, values_from = n) %>%
    mutate(Total = sum(Indel_2, Indel_3, SNP_2, SNP_3))
}

t1 = count_by_type(var_list)
t2 = count_by_type(next_steps)

bind_rows(t1, t2) %>% kable() 
Filter Indel_2 Indel_3 SNP_2 SNP_3 Total
1_Raw_variants 571489 2367140 6540378 650787 10129794
2_Filtered_on_quality 503076 2051124 5789157 579132 8922489
3_Filtered_errors 498557 1958078 5768341 561543 8786519
4_Filtered_samples 491949 1821794 5578488 514587 8406818
5_Filtered_80_NA 390087 1466947 4452192 431058 6740284
# Long format: one row per (Filter, Var_type) count
temp = bind_rows(t1, t2) %>%
  pivot_longer(-Filter, names_to = "Var_type", values_to = "Var_count")

temp %>%
  filter(Var_type != "Total") %>%
  ggplot() +
  # geom_col is geom_bar(stat = "identity")
  geom_col(aes(x = Filter, y = Var_count, fill = Var_type), position = "stack") +
  scale_fill_manual(values = c("#FFD399", "#ff9f1c", "#cbf3f0", "#2ec4b6")) +
  theme_minimal() +
  labs(title = "Variants number along the filtering steps", 
       x = "", y = "Number of variants", 
       fill = "Variant type") +
  # Total count above each stack
  geom_label(data = filter(temp, Var_type == "Total"), 
             aes(x = Filter, y = Var_count + 500000, label = Var_count)) +
  # Biallelic-SNP count inside its segment
  geom_label(data = filter(temp, Var_type == "SNP_2"), 
             aes(x = Filter, y = (Var_count / 2) + 500000, label = Var_count))

#Bar plots
# variants_per_step1 is a plot object built in an earlier chunk
# (presumably the bar plot for filtering steps 1-2 — TODO confirm);
# printing it renders the figure here
variants_per_step1

# Variant counts per chromosome, faceted by filtering step
next_steps %>%
  count(CHROM, Filter, Var_type) %>%
  ggplot(aes(x = as.numeric(CHROM), y = n, fill = Var_type)) +
  geom_col(position = "stack") +
  facet_wrap(vars(Filter)) +
  coord_flip() +
  labs(title = "Variants number after filtering on errors and samples", 
       x = "Chromosome", y = "Number of variants", 
       fill = "Variant type") +
  theme_minimal() +
  scale_fill_manual(values = c("#FFD399", "#ff9f1c", "#cbf3f0", "#2ec4b6"))

# Collect all allele-count (.frq.count) files — presumably vcftools
# --counts output, one per chromosome; TODO confirm — and derive
# minor/major allele counts and minor allele frequency per site.
counts = list.files(path = vcf_qual_check_dir, 
                    pattern = "filtered.clean.AB_filtered.variants.good_samples.frq.count", 
                    full.names = T)  %>% 
    map_df(~read_tsv(., skip = 1, 
                     col_names = c("CHROM", "POS", "N_ALLELES", "N_CHR", "Count1", "Count2", "Count3"), 
                     col_types = cols(col_character(), col_double(), col_double(), col_double(), 
                                      col_double(), col_double(), col_double()))) %>%
  # rowwise() so min()/max() operate per site (Count3 is NA at biallelic
  # sites, hence na.rm = T). NOTE(review): pmin()/pmax() would be faster
  # and avoid leaving the result rowwise-grouped — verify downstream use
  # before changing.
  rowwise() %>%
  dplyr::mutate(minorAC = min(Count1, Count2, Count3, na.rm = T), 
         majorAC = max(Count1, Count2, Count3, na.rm = T),
         freq = minorAC/N_CHR)